Overview

> library(ggplot2)
> library(reshape)
> library(gplots)
> library(edgeR)
> library(CHBUtils)
> library(pheatmap)
> library(knitr)
> library(DESeq2)
> project_summary = "../project-summary.csv"
> counts_file = "../combined.counts"
> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
+     "#D55E00", "#CC79A7")
> summarydata = read.table(project_summary, header = TRUE, sep = ",")
> rownames(summarydata) = summarydata$Name
> summarydata = summarydata[order(summarydata$Name), ]
> counts = read.table(counts_file, header = TRUE, row.names = "id")
> counts = counts[, order(colnames(counts))]
> # this is a list of all non user-supplied metadata columns that could appear
> known_columns = c("Name", "X.GC", "Exonic.Rate", "Sequences.flagged.as.poor.quality", 
+     "rRNA.rate", "Fragment.Length.Mean", "Intronic.Rate", "Intergenic.Rate", 
+     "Mapping.Rate", "Quality.format", "Duplication.Rate.of.Mapped", "Mapped", 
+     "rRNA", "Sequence.length", "Transcripts.Detected", "Mean.Per.Base.Cov.", 
+     "Genes.Detected", "Unique.Starts.Per.Read", "unique_starts_per_read", "complexity")
> batch2 = read.table("../batch_2_samples.txt", sep = "\t")
> batch2$batch = 2
> m = merge(summarydata, batch2, by.x = "Name", by.y = "V1", all.x = TRUE)
> m$batch[is.na(m$batch)] = 1
> summarydata = m
> summarydata = summarydata[, !colnames(summarydata) %in% c("species", "time")]
> summarydata$tissue_status = as.factor(paste(summarydata$tissue, summarydata$status, 
+     sep = "_"))
> rownames(summarydata) = summarydata$Name
> get_heatmap_fn = function(summarydata) {
+     # return the pheatmap function with or without metadata
+     metadata = summarydata[, !colnames(summarydata) %in% known_columns, drop = FALSE]
+     metadata = cbind(metadata, summarydata[, c("Mapping.Rate", "X.GC")])
+     if (ncol(metadata) == 0) {
+         return(pheatmap)
+     } else {
+         rownames(metadata) = summarydata$Name
+         heatmap_fn = function(data, ...) {
+             pheatmap(data, annotation = metadata, ...)
+         }
+         return(heatmap_fn)
+     }
+ }
> heatmap_fn = get_heatmap_fn(summarydata)

Quality control metrics

Summary statistics of summarydata

> numeric_columns = c("X.GC", "Exonic.Rate", "rRNA.rate", "Intronic.Rate", "Intergenic.Rate", 
+     "Mapping.Rate", "Mapped", "Fragment.Length.Mean", "Genes.Detected", "Transcripts.Detected")
> sumdf = data.frame(summary(summarydata[, numeric_columns]))
> 
> 
> ## Mapped reads
> ggplot(summarydata, aes(x = Name, y = Mapped)) + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     geom_bar(stat = "identity") + ylab("mapped reads") + xlab("")

Genomic mapping rate

> ggplot(summarydata, aes(x = Name, y = Mapping.Rate)) + geom_bar(stat = "identity") + 
+     ylab("mapping rate") + xlab("") + theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90))

Number of genes detected

> dd = data.frame(Name = names(counts), Genes.Detected = colSums(counts > 0))
> ggplot(dd, aes(x = Name, y = Genes.Detected)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("genes detected") + 
+     xlab("")

Exonic mapping rate

> ggplot(summarydata, aes(x = Name, y = Exonic.Rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("exonic mapping rate") + 
+     xlab("")

rRNA mapping rate

> ggplot(summarydata, aes(x = Name, y = rRNA.rate)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("rRNA rate") + 
+     xlab("")

Estimated fragment length of paired-end reads

> ggplot(summarydata, aes(x = Name, y = Fragment.Length.Mean)) + geom_bar(stat = "identity") + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + ylab("fragment length") + 
+     xlab("")

Boxplot of log10 counts per gene

> melted = melt(counts)
> colnames(melted) = c("sample", "count")
> melted$sample = factor(melted$sample)
> melted$sample = reorder(melted$sample, colnames(counts))
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Boxplot of log10 TMM-normalized counts per gene

Trimmed mean of M-values (TMM) normalization is described here

Robinson, M. D., & Oshlack, A. (2010). A scaling normalization method for differential expression analysis of RNA-seq data. Genome Biology, 11(3). doi:10.1186/gb-2010-11-3-r25

> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> normalized_counts = cpm(y, normalized.lib.sizes = TRUE)
> melted = melt(normalized_counts)
> colnames(melted) = c("gene", "sample", "count")
> melted$sample = factor(melted$sample)
> melted$sample = reorder(melted$sample, colnames(counts))
> melted$count = log(melted$count)
> ggplot(melted, aes(x = sample, y = count)) + geom_boxplot() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Density of log10 TMM-normalized counts

> ggplot(melted, aes(x = count, group = sample)) + geom_density() + theme_bw(base_size = 10) + 
+     theme(panel.grid.major = element_line(size = 0.5, color = "grey"), axis.text.x = element_text(angle = 90)) + 
+     xlab("")

Correlation (Pearson) heatmap of TMM-normalized counts

> heatmap_fn(cor(normalized_counts, method = "pearson"), fontsize = 6)

GC content seems to cluster with tissue type with the head and rest of body having lower GC content than the spermatheca, MAG and atrium.

Correlation (Spearman) heatmap of TMM-normalized counts

> heatmap_fn(cor(normalized_counts, method = "spearman"), fontsize = 6)

One sample, DT3_ALBI_Sp_M is very different from all of the other samples. This has the lower number of transcripts detected at 1.2k which is about 1/10th of the other samples. This sample likely has some kind of either contamination issue or PCR artifact happening, where a single gene or a small number of genes was sequenced repeatedly.

> kable(summarydata[summarydata$Name == "DT3_ALBI_Sp_M", ], format = "markdown")
Name X.GC Exonic.Rate Sequences.flagged.as.poor.quality rRNA.rate Fragment.Length.Mean Intronic.Rate Intergenic.Rate Mapping.Rate Quality.format Duplication.Rate.of.Mapped Mapped rRNA unique_starts_per_read Sequence.length Transcripts.Detected Mean.Per.Base.Cov. complexity Genes.Detected tissue status sex rep batch tissue_status
DT3_ALBI_Sp_M DT3_ALBI_Sp_M 51 0.5814477 0 0 101 0.2427143 0.175766 0.9899027 standard 0 9214889 0 NA 25-101 1216 79.68765 NA 1 spermatheca mated female D 2 spermatheca_mated

as opposed to the first five samples sequenced:

> kable(head(summarydata), format = "markdown")
Name X.GC Exonic.Rate Sequences.flagged.as.poor.quality rRNA.rate Fragment.Length.Mean Intronic.Rate Intergenic.Rate Mapping.Rate Quality.format Duplication.Rate.of.Mapped Mapped rRNA unique_starts_per_read Sequence.length Transcripts.Detected Mean.Per.Base.Cov. complexity Genes.Detected tissue status sex rep batch tissue_status
ACD_T3_RBm_Albi ACD_T3_RBm_Albi 36 0.3994357 0 0 113 0.1882835 0.4120508 0.5325388 standard 0 62323037 0 NaN 25-101 10322 47.74643 NA 1 body mated male A-C-D 1 body_mated
AT3_ALBI_Sp_M AT3_ALBI_Sp_M 50 0.5859818 0 0 154 0.1694706 0.2443366 0.5953684 standard 0 66183201 0 NaN 25-101 9434 62.89629 NA 1 spermatheca mated female A 2 spermatheca_mated
AT3_ALBI_Sp_V AT3_ALBI_Sp_V 52 0.6092181 0 0 283 0.1749786 0.2157042 0.6819525 standard 0 179442063 0 NaN 25-101 10027 175.74693 NA 1 spermatheca virgin female A 2 spermatheca_virgin
AT3_MAG_M_Albi AT3_MAG_M_Albi 45 0.5559447 0 0 137 0.2191971 0.2247786 0.8584414 standard 0 58087516 0 NaN 25-101 8959 33.30295 NA 1 MAG mated male A 1 MAG_mated
BCD_T3_RBv_Albi BCD_T3_RBv_Albi 35 0.3455879 0 0 113 0.2008765 0.4532733 0.5256571 standard 0 66953895 0 NaN 25-101 9731 44.50073 NA 1 body virgin male B-C-D 1 body_virgin
BT3_ALBI_At_M BT3_ALBI_At_M 45 0.4863401 0 0 178 0.1644467 0.3491574 0.7796990 standard 0 128394650 0 NaN 25-101 9823 137.04950 NA 1 atrium mated female B 1 atrium_mated

We should drop DT3_ALBI_Sp_M from the analysis.

> summarydata = summarydata[!rownames(summarydata) == "DT3_ALBI_Sp_M", ]
> counts = counts[, !colnames(counts) == "DT3_ALBI_Sp_M"]
> normalized_counts = normalized_counts[, !colnames(normalized_counts) == "DT3_ALBI_Sp_M"]

The overall exonic mapping rates is low:

> summary(summarydata$Exonic.Rate)

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.3103 0.4006 0.4863 0.4857 0.5889 0.6468

It isn’t clear if these issues are due to the fact the genome/annotation for the mosquito is poor or if there is a problem with the samples.

MDS plot of TMM-normalized counts

> mds(normalized_counts, k = length(colnames(normalized_counts)) - 1)

There is pretty consistent separation of the head with M/V status, and maybe some separation in MAG with M/V status, barring CT3, and an overall separation of tissue type. Interestingly, atrium and rest of body samples look pretty similar to each other.

Heatmap of top 30 most expressed genes

> select = order(rowMeans(counts), decreasing = TRUE)[1:30]
> heatmap_fn(counts[select, ])

Differential expression

We’ll look at differences that are driven by mating status in the same tissue. The way we will do this is to create factors that describe both the tissue status and mating status, and then do pairwise comparisons across those tissues. This will answer the question of what genes are different in each specific tissue.

> library(DESeq2)
> library(DEGreport)
> library(vsn)
> design = ~sex + tissue_status
> condition = "tissue"
> counts <- counts[rowSums(counts > 0) > 1, ]
> dds = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, design = design)
> dds = DESeq(dds)

Now that we have the differential expression calls using the tissue and status, we can pull out the mating specific differences in each tissue.

Atrium

> atrium = results(dds, contrast = list(c("tissue_statusatrium_mated"), c("tissue_statusatrium_virgin")))
> write.table(atrium, file = "atrium_deseq2.tsv", sep = "\t", row.names = TRUE, 
+     col.names = TRUE, quote = FALSE)
> atrium_de = subset(atrium, padj < 0.1)

There are 1067 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 370 upregulated in mated samples and 697 in atrium samples.

There is quite a bit of variability between samples:

> DESeq2::plotMA(atrium)

> atrium_stats = as.data.frame(atrium[, c(2, 6)])
> volcano_density_plot(atrium_stats, title = names(atrium_stats), lfc.cutoff = 1.5)

MAG

> MAG = results(dds, contrast = list(c("tissue_statusMAG_mated"), c("tissue_statusMAG_virgin")))
> write.table(MAG, file = "MAG_deseq2.tsv", sep = "\t", row.names = TRUE, col.names = TRUE, 
+     quote = FALSE)
> MAG_de = subset(MAG, padj < 0.1)

There are 1088 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 583 upregulated in mated samples and 505 in atrium samples.

There is quite a bit of variability between samples:

> DESeq2::plotMA(MAG)

> MAG_stats = as.data.frame(MAG[, c(2, 6)])
> volcano_density_plot(MAG_stats, title = names(MAG_stats), lfc.cutoff = 1.5)

body

> summarydata$sex_tissue_status = paste(summarydata$sex, summarydata$tissue_status, 
+     sep = "_")
> design_body = ~sex_tissue_status
> dds_body = DESeqDataSetFromMatrix(countData = counts, colData = summarydata, 
+     design = design_body)
> dds_body = DESeq(dds_body)
> female_body = results(dds_body, contrast = list(c("sex_tissue_statusfemale_body_mated"), 
+     c("sex_tissue_statusfemale_body_virgin")))
> male_body = results(dds_body, contrast = list(c("sex_tissue_statusmale_body_mated"), 
+     c("sex_tissue_statusmale_body_virgin")))
> write.table(male_body, file = "male_body_deseq2.tsv", sep = "\t", row.names = TRUE, 
+     col.names = TRUE, quote = FALSE)
> write.table(female_body, file = "female_body_deseq2.tsv", sep = "\t", row.names = TRUE, 
+     col.names = TRUE, quote = FALSE)
> male_body_de = subset(male_body, padj < 0.1)
> female_body_de = subset(female_body, padj < 0.1)

There are 0 genes flagged as differentally expressed between the atrium of mated and virgin samples in the male, with 0 upregulated in mated samples and 0 in the virgin samples samples.

There are 3 genes flagged as differentally expressed between the atrium of mated and virgin samples in the female, with 0 upregulated in mated samples and 3 in the virgin samples samples.

> DESeq2::plotMA(body)

> body_stats = as.data.frame(body[, c(2, 6)])
> volcano_density_plot(body_stats, title = names(body_stats), lfc.cutoff = 1.5)

spermatheca

> spermatheca = results(dds, contrast = list(c("tissue_statusspermatheca_mated"), 
+     c("tissue_statusspermatheca_virgin")))
> write.table(spermatheca, file = "spermatheca_deseq2.tsv", sep = "\t", row.names = TRUE, 
+     col.names = TRUE, quote = FALSE)
> spermatheca_de = subset(spermatheca, padj < 0.1)

There are 978 genes flagged as differentally expressed between the atrium of mated and virgin samples, with 655 upregulated in mated samples and 323 in atrium samples.

There is quite a bit of variability between samples:

> DESeq2::plotMA(spermatheca)

> spermatheca_stats = as.data.frame(spermatheca[, c(2, 6)])
> volcano_density_plot(spermatheca_stats, title = names(spermatheca_stats), lfc.cutoff = 1.5)

PCA

Samples tend to cluster on the PCA plot by tissue and to a lesser degree mated/virgin status, that is a good sign.

> rld = rlog(dds)
> plotPCA(rld, intgroup = c("tissue", "status"))

Effect of variance stabilization

There is a huge amount of dispersion between these samples.

> par(mfrow = c(1, 3))
> notAllZero <- (rowSums(counts(dds)) > 0)
> rld <- rlog(dds)
> vsd <- varianceStabilizingTransformation(dds)
> rlogMat <- assay(rld)
> vstMat <- assay(vsd)
> 
> meanSdPlot(log2(counts(dds, normalized = TRUE)[notAllZero, ] + 1), ylim = c(0, 
+     2.5))
> meanSdPlot(assay(rld[notAllZero, ]), ylim = c(0, 2.5))
> meanSdPlot(assay(vsd[notAllZero, ]), ylim = c(0, 2.5))

Dispersion estimates

> plotDispEsts(dds)

Tissue markers

One of the things we can do with this data is to produce a set of tissue-specific markers. We’re looking for a set a gene signatures that can be used to classify the different tissues.

We’ll do NMF to pull out gene signatures for the different individual tissues.

> cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", 
+     "#D55E00", "#CC79A7", "#000000")
> 
> library(edgeR)
> library(limma)
> library(NMF)
> design = ~sex + tissue_status + sex:tissue_status
> dge = DGEList(counts = counts)
> dge = calcNormFactors(y)
> voomed = voom(dge, plot = TRUE)

> # tissue_metadata = subset(summarydata, tissue != 'body')
> tissue_metadata <- summarydata
> tissue_only = normalized_counts[, colnames(normalized_counts) %in% rownames(tissue_metadata)]
> tissue_only = tissue_only[rowSums(tissue_only) > 1, ]
> adf_tissue = AnnotatedDataFrame(tissue_metadata[, c("sex", "tissue")], data.frame(labelDescription = c("sex", 
+     "tissue")))
> x = ExpressionSet(as.matrix(tissue_only), adf_tissue)
> nbasis <- length(unique(summarydata$tissue))
> n = nmf(x, nbasis, nrun = 100, .options = list(parallel = FALSE))
> consensus = consensusmap(n, annCol = x)

> basismap(n, scale = "r1", annColors = list(basis = cbPalette[4:8], consensus = brewer.pal(4, 
+     "Spectral")), main = "Metagene Components - All Contributing Genes", Rowv = TRUE)

> s = featureScore(n)
> summary(s)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
0.001792 0.075890 0.175600 0.269500 0.403700 1.000000 
> s = extractFeatures(n)
> str(s)
List of 5
 $ : int [1:41] 7694 3839 2517 785 3200 5900 1379 5426 6787 5979 ...
 $ : int [1:173] 9616 2207 1013 9743 9617 2221 11340 8458 5887 4557 ...
 $ : int [1:63] 513 2351 2713 2716 4356 4358 4360 4362 6550 2714 ...
 $ : int [1:91] 1690 6958 510 7882 508 7554 6957 509 2836 6660 ...
 $ : int [1:186] 604 1224 8524 9150 9592 8195 1718 3629 6050 5377 ...
 - attr(*, "method")= chr "kim"
> z = featureScore(n)
> 
> 
> numfeatures <- sapply(seq(0, 1, 0.01), function(num) {
+     lapply(extractFeatures(n, num), function(x) {
+         length(x)
+     })
+ })
> minnumfeatures = 20
> rel.basis.contrib.cutoff <- max(seq(0, 1, 0.01)[apply(numfeatures, 2, function(x) all(x > 
+     minnumfeatures))])
> 
> basismap(n, scale = "r1", subsetRow = rel.basis.contrib.cutoff, annColors = list(basis = cbPalette[4:8], 
+     consensus = brewer.pal(4, "Spectral")), main = "Metagene Components - Most Specific Genes")

> coefmap(n, scale = "c1", labCol = tissue_metadata$tissue, annColors = list(basis = cbPalette[4:8]))

> # write out the metagene features
> 
> fs = featureScore(n)
> features = extractFeatures(n, 0.9)
> head_features = fs[features[[5]]]
> head_features[is.na(head_features)] = 1
> head_counts = tissue_only[names(head_features), colnames(tissue_only) %in% subset(summarydata, 
+     tissue == "head")$Name]
> head_scores = head_features * log(rowMeans(head_counts))
> head_plot = sort(head_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(head_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")

> ggsave("head_metagene_plot.pdf")
> write.table(head_scores, file = "head_metagene_scores.txt", quote = FALSE, col.names = FALSE)
> 
> 
> atrium_features = fs[features[[1]]]
> atrium_features[is.na(atrium_features)] = 1
> atrium_counts = tissue_only[names(atrium_features), colnames(tissue_only) %in% 
+     subset(summarydata, tissue == "atrium")$Name]
> atrium_scores = atrium_features * log(rowMeans(atrium_counts))
> atrium_plot = sort(atrium_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(atrium_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")

> ggsave("atrium_metagene_plot.pdf")
> write.table(atrium_scores, file = "atrium_metagene_scores.txt", quote = FALSE, 
+     col.names = FALSE)
> 
> 
> spermatheca_features = fs[features[[4]]]
> spermatheca_features[is.na(spermatheca_features)] = 1
> spermatheca_counts = tissue_only[names(spermatheca_features), colnames(tissue_only) %in% 
+     subset(summarydata, tissue == "spermatheca")$Name]
> spermatheca_scores = spermatheca_features * log(rowMeans(spermatheca_counts))
> spermatheca_plot = sort(spermatheca_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(spermatheca_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")

> ggsave("spermatheca_metagene_plot.pdf")
> write.table(spermatheca_scores, file = "spermatheca_metagene_scores.txt", quote = FALSE, 
+     col.names = FALSE)
> 
> MAG_features = fs[features[[3]]]
> MAG_features[is.na(MAG_features)] = 1
> MAG_counts = tissue_only[names(MAG_features), colnames(tissue_only) %in% subset(summarydata, 
+     tissue == "MAG")$Name]
> MAG_scores = MAG_features * log(rowMeans(MAG_counts))
> MAG_plot = sort(MAG_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(MAG_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")

> ggsave("MAG_metagene_plot.pdf")
> write.table(MAG_scores, file = "MAG_metagene_scores.txt", quote = FALSE, col.names = FALSE)
> 
> body_features = fs[features[[2]]]
> body_features[is.na(body_features)] = 1
> body_counts = tissue_only[names(body_features), colnames(tissue_only) %in% subset(summarydata, 
+     tissue == "body")$Name]
> body_scores = body_features * log(rowMeans(body_counts))
> body_plot = sort(body_scores, decreasing = TRUE)[1:50]
> melted = melt(tissue_only[names(body_plot), ])
> m = merge(melted, summarydata, by.x = "X2", by.y = "Name")
> ggplot(m, aes(X1, value, color = tissue)) + geom_point() + scale_y_log10() + 
+     theme_bw(base_size = 10) + theme(panel.grid.major = element_line(size = 0.5, 
+     color = "grey"), axis.text.x = element_text(angle = 90)) + xlab("") + ylab("score")

> ggsave("body_metagene_plot.pdf")
> write.table(body_scores, file = "body_metagene_scores.txt", quote = FALSE, col.names = FALSE)

We can see that the metagenes separate out the different body parts for the most part, except for a single body sample which has a signature more similar to the atrium than the body samples. Could that sample have both atrium and the rest of body in it?

Digging into AALB000422, AALB000423 and AALB014036

Why are 422 and 423 missing? It looks like they are only expressed in MAG, and are expressed at an extremely high level in MAG.

> missing_genes = c("AALB000422", "AALB000423")
> counts[missing_genes, ]
           ACD_T3_RBm_Albi AT3_ALBI_Sp_M AT3_ALBI_Sp_V AT3_MAG_M_Albi
AALB000422               0            12            65         874156
AALB000423               0             0            12         317583
           BCD_T3_RBv_Albi BT3_ALBI_At_M BT3_ALBI_At_V BT3_ALBI_H_M
AALB000422               1             0             0            0
AALB000423               5             0             0            0
           BT3_ALBI_H_V BT3_ALBI_RB_M BT3_ALBI_RB_V BT3_ALBI_Sp_M
AALB000422            0             0             0            47
AALB000423            0             0             0             9
           BT3_ALBI_Sp_V BT3_MAG_V_Albi CT3_ALBI_At_M CT3_ALBI_At_V
AALB000422             0         307223             0             0
AALB000423             0          40297             0             0
           CT3_ALBI_H_M CT3_ALBI_H_V CT3_ALBI_RB_M CT3_ALBI_RB_V
AALB000422            0            0             0             0
AALB000423            0            0             0             0
           CT3_MAG_M_Albi CT3_MAG_V_Albi DT3_ALBI_At_M DT3_ALBI_At_V
AALB000422        1107671         594214             1             1
AALB000423         177929          52428             2             0
           DT3_ALBI_H_M DT3_ALBI_H_V DT3_ALBI_RB_M DT3_ALBI_RB_V
AALB000422            0            0             0             0
AALB000423            0            0             0             0
           DT3_ALBI_Sp_V DT3_MAG_M_Albi DT3_MAG_V_Albi
AALB000422             0        1781563         736322
AALB000423             0         465594         108771
> MAG[missing_genes, ]
log2 fold change (MAP): tissue_statusMAG_mated vs tissue_statusMAG_virgin 
Wald test p-value: tissue_statusMAG_mated vs tissue_statusMAG_virgin 
DataFrame with 2 rows and 6 columns
            baseMean log2FoldChange     lfcSE      stat    pvalue
           <numeric>      <numeric> <numeric> <numeric> <numeric>
AALB000422 240320.86       1.291841  1.128429  1.144813        NA
AALB000423  53559.16       2.234039  1.445440  1.545577 0.1222067
                padj
           <numeric>
AALB000422        NA
AALB000423 0.4165237

We can see there is a high SE for the log fold change, which is why these are not being called significant, it isn’t that these genes aren’t expressed.

> old_counts = read.table("/Volumes/Clotho/Users/rory/cache/mosquito_project/data/albimanus/combined.counts", 
+     header = TRUE, sep = "\t", row.names = 1)
> colnames(old_counts) = c("AT3_MAG_M_Albi", "BT3_ALBI_At_M", "BT3_ALBI_At_V", 
+     "BT3_MAG_V_Albi", "CT3_ALBI_At_M", "CT3_ALBI_At_V", "CT3_MAG_M_Albi", "CT3_MAG_V_Albi", 
+     "DT3_ALBI_At_M", "DT3_ALBI_At_V", "DT3_MAG_M_Albi", "DT3_MAG_V_Albi")
> old_MAG = read.table("/Volumes/Clotho/Users/rory/cache/mosquito_project/scripts/albimanus_MAG.tsv", 
+     sep = "\t", header = TRUE)
> library(reshape)
> melted_counts = melt(as.matrix(counts))
> colnames(melted_counts) = c("gene", "sample", "count")
> melted_counts$run = "new"
> 
> old_melted_counts = melt(as.matrix(old_counts))
> colnames(old_melted_counts) = c("gene", "sample", "count")
> old_melted_counts$run = "old"
> head(old_melted_counts)
        gene         sample count run
1 AALB014361 AT3_MAG_M_Albi    22 old
2 AALB003697 AT3_MAG_M_Albi    45 old
3 AALB003698 AT3_MAG_M_Albi   427 old
4 AALB014362 AT3_MAG_M_Albi    39 old
5 AALB014363 AT3_MAG_M_Albi     0 old
6 AALB003699 AT3_MAG_M_Albi    16 old
> in_both = rbind(melted_counts[melted_counts$sample %in% old_melted_counts$sample, 
+     ], old_melted_counts)
> mcompare = merge(melted_counts, old_melted_counts, by.x = c("gene", "sample"), 
+     by.y = c("gene", "sample"), all = FALSE)

We can see below that in general we have more reads aligning to each gene with the new analysis. This is likely to be partially due improvements in the aligner and improvements in the gene models.

> ggplot(mcompare, aes(count.x, count.y)) + geom_point() + xlab("new") + ylab("old") + 
+     scale_y_sqrt() + scale_x_sqrt()

Looking at the old code that just had the atrium and MAG to compare, we did the same thing where we built the model using all of the samples. We used limma last time to call differential expression instead of DESeq2 which is another source of difference; we could do the same thing again for this sample and see if it is a DESeq2 vs limma difference or not.

> library(edgeR)
> design = model.matrix(~0 + tissue_status, data = summarydata)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = counts)
> y = calcNormFactors(y)
> v = voom(y, design, plot = TRUE)

> fit_limma = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated - 
+     MAG_virgin, head = head_mated - head_virgin, spermatheca = spermatheca_mated - 
+     spermatheca_virgin, levels = design)
> fit_limma2 = contrasts.fit(fit_limma, cm)
> fit_limma2 = eBayes(fit_limma2)
> MAG_results_limma = topTable(fit_limma2, n = Inf, p.value = 1, coef = "MAG")
> MAG_results_limma[missing_genes, ]
              logFC    AveExpr        t     P.Value  adj.P.Val          B
AALB000422 1.380099 -0.2388494 2.090200 0.047002900 0.52990333 -4.1047694
AALB000423 2.439860 -0.9987175 3.700783 0.001072736 0.06534383 -0.7726864

ALB000422 gets knocked out of being significant if you use limma instead of DESeq2, but AALB000423 doesn’t. If we just do what we did before, include only the atirum and the MAG results, they are both significant.

> library(edgeR)
> summarysubset = subset(summarydata, tissue %in% c("atrium", "MAG"))
> countssubset = counts[, colnames(counts) %in% summarysubset$Name]
> summarysubset$tissue_status = as.character(summarysubset$tissue_status)
> summarysubset$tissue = as.character(summarysubset$tissue)
> summarysubset$status = as.character(summarysubset$status)
> design = model.matrix(~0 + tissue_status, data = summarysubset)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = countssubset)
> y = calcNormFactors(y)
> v = voom(y, design, plot = FALSE)
> fit_limma_subset = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated - 
+     MAG_virgin, levels = design)
> fit_limma_subset2 = contrasts.fit(fit_limma_subset, cm)
> fit_limma_subset2 = eBayes(fit_limma_subset2)
> MAG_results_limma_subset = topTable(fit_limma_subset2, n = Inf, p.value = 1, 
+     coef = "MAG")
> MAG_results_limma_subset[missing_genes, ]
              logFC  AveExpr        t      P.Value  adj.P.Val         B
AALB000422 1.417859 5.467304 4.159149 0.0015387210 0.07765818 -1.013592
AALB000423 2.472445 4.127719 5.662333 0.0001376023 0.02262663  1.358130

However quantile normalizing the voom counts also has an effect:

> library(edgeR)
> summarysubset = subset(summarydata, tissue %in% c("atrium", "MAG"))
> countssubset = counts[, colnames(counts) %in% summarysubset$Name]
> summarysubset$tissue_status = as.character(summarysubset$tissue_status)
> summarysubset$tissue = as.character(summarysubset$tissue)
> summarysubset$status = as.character(summarysubset$status)
> design = model.matrix(~0 + tissue_status, data = summarysubset)
> colnames(design) = gsub("tissue_status", "", colnames(design))
> y = DGEList(counts = countssubset)
> y = calcNormFactors(y)
> v = voom(y, design, plot = FALSE, normalize = "quantile")
> fit_limma_subset_quantile = lmFit(v, design)
> cm = makeContrasts(atrium = atrium_mated - atrium_virgin, MAG = MAG_mated - 
+     MAG_virgin, levels = design)
> fit_limma_subset_quantile2 = contrasts.fit(fit_limma_subset_quantile, cm)
> fit_limma_subset_quantile2 = eBayes(fit_limma_subset_quantile2)
> MAG_results_limma_subset_quantile = topTable(fit_limma_subset_quantile2, n = Inf, 
+     p.value = 1, coef = "MAG")
> MAG_results_limma_subset_quantile[missing_genes, ]
               logFC  AveExpr         t     P.Value adj.P.Val         B
AALB000422 0.1981414 4.956242 0.5290292 0.609346864 0.9530702 -6.701848
AALB000423 1.1355503 3.925534 3.3622285 0.008137943 0.1683863 -2.862057

So there are a set of genes that are pretty sensitive to how we do the analysis; if we swap callers, or how we combine them together or how we do the normalization, it matters. These are a set of genes that are not robust changes. We can make reasonable arguments for doing any of these.

> MAG <- MAG[order(rownames(MAG)), ]
> MAG_results_limma <- MAG_results_limma[order(rownames(MAG_results_limma)), ]
> MAG_results_limma_subset <- MAG_results_limma_subset[order(rownames(MAG_results_limma_subset)), 
+     ]
> MAG_results_limma_subset_quantile <- MAG_results_limma_subset_quantile[order(rownames(MAG_results_limma_subset_quantile)), 
+     ]

The actual fold changes between doing an analysis on the reduced dataset vs. the full one are not much different.

> qplot(MAG_results_limma$logFC, MAG_results_limma_subset$logFC) + xlab("full model") + 
+     ylab("reduced model") + theme_bw() + theme(text = element_text(family = "Gill Sans", 
+     size = 10), strip.background = element_rect(fill = "white"))

> qplot((MAG_results_limma$logFC + MAG_results_limma_subset$logFC)/2, MAG_results_limma$logFC - 
+     MAG_results_limma_subset$logFC, geom = "hex") + xlab("average logFC") + 
+     ylab("full logFC - reduced logFC") + theme_bw() + theme(text = element_text(family = "Gill Sans", 
+     size = 10), strip.background = element_rect(fill = "white")) + ggtitle("bland-altman plot")

But the calculated FDR is very different:

> qplot(MAG_results_limma$adj.P.Val, MAG_results_limma_subset$adj.P.Val) + xlab("full model") + 
+     ylab("reduced model")

> qplot((MAG_results_limma$logFC + MAG_results_limma_subset$logFC)/2, MAG_results_limma$logFC - 
+     MAG_results_limma_subset$logFC) + xlab("average logFC") + ylab("full logFC - reduced logFC")

> qplot((MAG_results_limma$P.Value + MAG_results_limma_subset$P.Value)/2, MAG_results_limma$P.Value - 
+     MAG_results_limma_subset$P.Value) + xlab("average pvalue") + ylab("full pvalue - reduced pvalue")

> qplot((fit_limma$sigma + fit_limma_subset$sigma)/2, fit_limma$sigma - fit_limma_subset$sigma) + 
+     stat_binhex() + scale_x_sqrt() + xlab("average variance") + ylab("full - reduced variance")

> in_either = unique(c(rownames(subset(MAG_results_limma, adj.P.Val < 0.1)), rownames(subset(MAG_results_limma_subset, 
+     adj.P.Val < 0.1))))
> 
> signew = MAG_results_limma[in_either, ]
> signew$run = "full"
> signew$sig = signew$adj.P.Val < 0.1
> sigold = MAG_results_limma_subset[in_either, ]
> sigold$run = "subset"
> sigold$sig = sigold$adj.P.Val < 0.1
> 
> m = merge(signew, sigold, by = "row.names")
> m$both = ifelse(m$sig.x & m$sig.y, "both", ifelse(m$sig.x, "full only", "subset only"))
> m$minpval <- pmin(m$adj.P.Val.x, m$adj.P.Val.y)
> ggplot(m, aes((AveExpr.x + AveExpr.y)/2, (AveExpr.x - AveExpr.y), size = minpval)) + 
+     geom_point(alpha = 0.6) + xlab("average expression") + ylab("expression difference") + 
+     facet_wrap(~both) + scale_size(range = c(0.3, 3)) + element_blank() + guides(colour = guide_legend(override.aes = list(alpha = 1), 
+     title = "found in"))

> allsig = rbind(signew, sigold)
> allsig$sig = allsig$adj.P.Val < 0.1
> allsig$id = rownames(allsig)
> library(dplyr)
> 
> ggplot(allsig, aes(AveExpr, logFC, shape = run, color = sig)) + geom_point()

limma

Write out the counts and subsetted summardata file for use with limma.

> write.table(file = "limma.counts", counts, col.names = TRUE, row.names = TRUE, 
+     quote = FALSE, sep = "\t")
> write.table(file = "limma.summarydata.txt", summarydata, col.names = TRUE, row.names = TRUE, 
+     quote = FALSE, sep = "\t")

Ontological analysis setup

These are some helper functions we defined to do a GO ontology of the results of a DESeq2 results dataframe.

> library(biomaRt)
> library(GOstats)
> library(GSEABase)
> mart = useMart(biomart = "vb_gene_mart_1512", host = "biomart.vectorbase.org")
> albimanus = useDataset(mart, dataset = "aalbimanus_eg_gene")
> bm = getBM(mart = albimanus, attributes = c("ensembl_gene_id", "transcript_biotype", 
+     "go_name_1006", "go_accession", "go_namespace_1003", "external_gene_id", 
+     "go_linkage_type"))
> goframeData = bm[, c("go_accession", "go_linkage_type", "ensembl_gene_id")]
> colnames(goframeData) = c("frame.go_id", "frame.Evidence", "frame.gene_id")
> goframeData = subset(goframeData, frame.Evidence != "")
> goFrame = GOFrame(goframeData)
> goAllFrame = GOAllFrame(goFrame)
> gsc = GeneSetCollection(goAllFrame, setType = GOCollection())
> 
> run_go = function(universe, genes, gsc) {
+     # given a list of gene ids of the gene universe and the enriched genes and a
+     # genesetcollection of ontology terms, find enrichment for molecular
+     # function, biological process and cellular component
+     params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes, 
+         universeGeneIds = universe, ontology = "MF", pvalueCutoff = 0.1, conditional = FALSE, 
+         testDirection = "over")
+     over = hyperGTest(params)
+     mf = summary(over)
+     colnames(mf)[1] = "GOID"
+     params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes, 
+         universeGeneIds = universe, ontology = "BP", pvalueCutoff = 0.1, conditional = FALSE, 
+         testDirection = "over")
+     over = hyperGTest(params)
+     bp = summary(over)
+     colnames(bp)[1] = "GOID"
+     params = GSEAGOHyperGParams(name = "foo", geneSetCollection = gsc, geneIds = genes, 
+         universeGeneIds = universe, ontology = "CC", pvalueCutoff = 0.1, conditional = FALSE, 
+         testDirection = "over")
+     over = hyperGTest(params)
+     cc = summary(over)
+     colnames(cc)[1] = "GOID"
+     df = data.frame()
+     if (nrow(mf) > 0) {
+         mf$ontology = "MF"
+         df = rbind(df, mf)
+     }
+     if (nrow(cc) > 0) {
+         cc$ontology = "CC"
+         df = rbind(df, cc)
+     }
+     if (nrow(bp) > 0) {
+         bp$ontology = "BP"
+         df = rbind(df, bp)
+     }
+     return(df)
+ }
> 
> deseq_go = function(deseqres, gsc) {
+     # run GO ontology analysis using GOstats from a DESeq dataframe and a gene
+     # set collection
+     universe = rownames(subset(deseqres, baseMean > 10))
+     genes = rownames(subset(deseqres, baseMean > 10 & padj < 0.1))
+     return(run_go(universe, genes, gsc))
+ }

Here we looked at GO term enrichment for differentially expressed genes in mated compared to virgin tissues within each tissue.

This performs a hypergeometric test for enriched genes against a background set. The background set of genes are all genes expressed in the tissue. We defined expressed as having a baseMean expression > 10. The enriched genes are defined as genes with a baseMean expression > 10 and an adjusted p-value for differential expression with mated/virgin status of < 0.1. We saved these as atrium_go.tsv, head_go.tsv, etc.

> atrium_go = deseq_go(atrium, gsc)
> write.table(atrium_go, file = "atrium_go.tsv", sep = "\t", row.names = FALSE, 
+     col.names = TRUE, quote = FALSE)
> MAG_go = deseq_go(MAG, gsc)
> write.table(MAG_go, file = "MAG_go.tsv", sep = "\t", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE)
> head_go = deseq_go(head, gsc)
> write.table(head_go, file = "head_go.tsv", sep = "\t", row.names = FALSE, col.names = TRUE, 
+     quote = FALSE)
> spermatheca_go = deseq_go(spermatheca, gsc)
> write.table(spermatheca_go, file = "spermatheca_go.tsv", sep = "\t", row.names = FALSE, 
+     col.names = TRUE, quote = FALSE)